{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "import pandas as pd\n",
    "\n",
    "base_dir = os.getcwd()\n",
    "\n",
    "# 原始数据\n",
    "atis_raw_train = os.path.join(base_dir, 'atis', 'atis.train.pkl')\n",
    "atis_raw_test = os.path.join(base_dir, 'atis', 'atis.test.pkl')\n",
    "\n",
    "# 处理后保存为csv文件\n",
    "atis_train_csv = os.path.join(base_dir, 'atis', 'atis.train.csv')\n",
    "atis_test_csv = os.path.join(base_dir, 'atis', 'atis.test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def load_atis(file_path):\n",
    "    with open(file_path, 'rb') as f_read:\n",
    "        ds,dicts = pickle.load(f_read)\n",
    "    print('done loading:', file_path)\n",
    "    print('samples:{}'.format(len(ds['query'])))\n",
    "    print('vocab_size:{}'.format(len(dicts['token_ids'])))\n",
    "    print('slot count:{}'.format(len(dicts['slot_ids'])))\n",
    "    print('intent count:{}'.format(len(dicts['intent_ids'])))\n",
    "    return ds,dicts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "done loading: E:\\project\\jupyter_workspace\\intent detection and slot filling\\atis\\atis.train.pkl\n",
      "samples:4978\n",
      "vocab_size:943\n",
      "slot count:129\n",
      "intent count:26\n",
      "train data process done!\n"
     ]
    }
   ],
   "source": [
    "'''\n",
    "    处理训练数据，保存为csv结构\n",
    "'''\n",
    "train_ds, train_dicts = load_atis(atis_raw_train)\n",
    "\n",
    "t2i, s2i, in2i = map(train_dicts.get, ['token_ids', 'slot_ids', 'intent_ids'])\n",
    "i2t, i2s, i2in = map(lambda d:{d[k]:k for k in d.keys()}, [t2i, s2i, in2i])\n",
    "\n",
    "query, slots, intent = map(train_ds.get, ['query', 'slot_labels', 'intent_labels'])\n",
    "\n",
    "train_source_target = []\n",
    "for i in range(len(train_ds['query'])):\n",
    "    intent_source_target_lst = []\n",
    "    \n",
    "    # 1.存储intent\n",
    "    intent_source_target_lst.append(i2in[intent[i][0]])\n",
    "    \n",
    "    # 2.存储source\n",
    "    source_data = list(' '.join(map(i2t.get, query[i])).split())\n",
    "    # 删除BOS\n",
    "    del(source_data[0])\n",
    "    # 删除EOS\n",
    "    del(source_data[-1])\n",
    "    intent_source_target_lst.append(' '.join(source_data))\n",
    "    \n",
    "    # 3.存储target\n",
    "    target_data = [i2s[slots[i][j]] for j in range(len(query[i]))]\n",
    "    # 删除BOS\n",
    "    del(target_data[0])\n",
    "    # 删除EOS\n",
    "    del(target_data[-1])\n",
    "    intent_source_target_lst.append(' '.join(target_data))\n",
    "    \n",
    "    train_source_target.append(intent_source_target_lst)\n",
    "\n",
    "name = ['intent', 'source', 'target']\n",
    "\n",
    "train_csv = pd.DataFrame(columns=name, data=train_source_target)\n",
    "train_csv.to_csv(atis_train_csv)\n",
    "\n",
    "print('train data process done!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "done loading: E:\\project\\jupyter_workspace\\intent detection and slot filling\\atis\\atis.test.pkl\n",
      "samples:893\n",
      "vocab_size:943\n",
      "slot count:129\n",
      "intent count:26\n",
      "test data process done!\n"
     ]
    }
   ],
   "source": [
    "'''\n",
    "    处理测试数据，保存为csv结构\n",
    "'''\n",
    "test_ds, test_dicts = load_atis(atis_raw_test)\n",
    "\n",
    "t2i, s2i, in2i = map(test_dicts.get, ['token_ids', 'slot_ids', 'intent_ids'])\n",
    "i2t, i2s, i2in = map(lambda d:{d[k]:k for k in d.keys()}, [t2i, s2i, in2i])\n",
    "\n",
    "query, slots, intent = map(test_ds.get, ['query', 'slot_labels', 'intent_labels'])\n",
    "\n",
    "test_source_target = []\n",
    "for i in range(len(test_ds['query'])):\n",
    "    intent_source_target_lst = []\n",
    "    \n",
    "    # 1.存储intent\n",
    "    intent_source_target_lst.append(i2in[intent[i][0]])\n",
    "    \n",
    "    # 2.存储source\n",
    "    source_data = list(' '.join(map(i2t.get, query[i])).split())\n",
    "    # 删除BOS\n",
    "    del(source_data[0])\n",
    "    # 删除EOS\n",
    "    del(source_data[-1])\n",
    "    intent_source_target_lst.append(' '.join(source_data))\n",
    "    \n",
    "    # 3.存储target\n",
    "    target_data = [i2s[slots[i][j]] for j in range(len(query[i]))]\n",
    "    # 删除BOS\n",
    "    del(target_data[0])\n",
    "    # 删除EOS\n",
    "    del(target_data[-1])\n",
    "    intent_source_target_lst.append(' '.join(target_data))\n",
    "    \n",
    "    test_source_target.append(intent_source_target_lst)\n",
    "\n",
    "name = ['intent', 'source', 'target']\n",
    "\n",
    "test_csv = pd.DataFrame(columns=name, data=test_source_target)\n",
    "test_csv.to_csv(atis_test_csv)\n",
    "\n",
    "print('test data process done!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>intent</th>\n",
       "      <th>source</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>flight</td>\n",
       "      <td>i want to fly from boston at 838 am and arrive...</td>\n",
       "      <td>O O O O O B-fromloc.city_name O B-depart_time....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>flight</td>\n",
       "      <td>what flights are available from pittsburgh to ...</td>\n",
       "      <td>O O O O O B-fromloc.city_name O B-toloc.city_n...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>flight_time</td>\n",
       "      <td>what is the arrival time in san francisco for ...</td>\n",
       "      <td>O O O B-flight_time I-flight_time O B-fromloc....</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>airfare</td>\n",
       "      <td>cheapest airfare from tacoma to orlando</td>\n",
       "      <td>B-cost_relative O O B-fromloc.city_name O B-to...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>airfare</td>\n",
       "      <td>round trip fares from pittsburgh to philadelph...</td>\n",
       "      <td>B-round_trip I-round_trip O O B-fromloc.city_n...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        intent                                             source  \\\n",
       "0       flight  i want to fly from boston at 838 am and arrive...   \n",
       "1       flight  what flights are available from pittsburgh to ...   \n",
       "2  flight_time  what is the arrival time in san francisco for ...   \n",
       "3      airfare            cheapest airfare from tacoma to orlando   \n",
       "4      airfare  round trip fares from pittsburgh to philadelph...   \n",
       "\n",
       "                                              target  \n",
       "0  O O O O O B-fromloc.city_name O B-depart_time....  \n",
       "1  O O O O O B-fromloc.city_name O B-toloc.city_n...  \n",
       "2  O O O B-flight_time I-flight_time O B-fromloc....  \n",
       "3  B-cost_relative O O B-fromloc.city_name O B-to...  \n",
       "4  B-round_trip I-round_trip O O B-fromloc.city_n...  "
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd_train = pd.read_csv(atis_train_csv, index_col=0)\n",
    "pd_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
